package edu.zut.cs.network;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Finds the most frequent Chinese (Han) characters or English words in a text.
 *
 * <p>
 * Both entry points take the text to analyse and the number of top entries
 * wanted, and return one formatted string per entry, most frequent first. Each
 * string has the shape {@code <rank>：<count>  <token>} (full-width colon after
 * the rank, two spaces before the token), for example {@code 1：12  the}.
 *
 * <p>
 * Entries with the same count are ordered by token in descending natural
 * {@link String} order.
 *
 * @author kaifang
 */
public class ChineseSearch {

	/**
	 * A "word" is a run of word characters, '+', '-' or apostrophes that starts
	 * and ends on a word boundary. Compiled once because it never changes.
	 */
	private static final Pattern WORD_PATTERN = Pattern.compile("\\b[\\w+\\-']+\\b");

	/**
	 * Orders entries by count (highest first) and, for equal counts, by token in
	 * descending order. Counts are compared numerically, so 10 ranks above 9.
	 */
	private static final Comparator<Map.Entry<String, Integer>> MOST_FREQUENT_FIRST = new Comparator<Map.Entry<String, Integer>>() {
		@Override
		public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
			int byCount = right.getValue().compareTo(left.getValue());
			if (byCount != 0) {
				return byCount;
			}
			return right.getKey().compareTo(left.getKey());
		}
	};

	/**
	 * Counts Chinese (Han script) characters and returns the most frequent ones.
	 *
	 * <p>
	 * Only code points whose Unicode script is Han are counted; whitespace,
	 * punctuation, digits and Latin letters are skipped. The text is walked by
	 * code point so that ideographs outside the Basic Multilingual Plane are
	 * counted as one character each.
	 *
	 * @param str   the text to analyse; must not be {@code null}
	 * @param index the maximum number of entries to return; must not be negative
	 * @return the formatted top entries, most frequent first; the array length is
	 *         the smaller of {@code index} and the number of distinct characters
	 * @throws NullPointerException     if {@code str} is {@code null}
	 * @throws IllegalArgumentException if {@code index} is negative
	 */
	public static String[] countToZH(String str, int index) {
		checkArguments(str, index);
		Map<String, Integer> counts = new TreeMap<String, Integer>();
		for (int offset = 0; offset < str.length();) {
			int codePoint = str.codePointAt(offset);
			offset += Character.charCount(codePoint);
			if (Character.UnicodeScript.of(codePoint) == Character.UnicodeScript.HAN) {
				increment(counts, new String(Character.toChars(codePoint)));
			}
		}
		return formatTop(counts, index);
	}

	/**
	 * Counts English words and returns the most frequent ones.
	 *
	 * <p>
	 * Words are the matches of {@link #WORD_PATTERN}. Matching is case-sensitive,
	 * so {@code The} and {@code the} are counted as different words.
	 *
	 * @param str   the text to analyse; must not be {@code null}
	 * @param index the maximum number of entries to return; must not be negative
	 * @return the formatted top entries, most frequent first; the array length is
	 *         the smaller of {@code index} and the number of distinct words
	 * @throws NullPointerException     if {@code str} is {@code null}
	 * @throws IllegalArgumentException if {@code index} is negative
	 */
	public static String[] countToEng(String str, int index) {
		checkArguments(str, index);
		Map<String, Integer> counts = new TreeMap<String, Integer>();
		Matcher matcher = WORD_PATTERN.matcher(str);
		while (matcher.find()) {
			increment(counts, matcher.group());
		}
		return formatTop(counts, index);
	}

	/** Rejects a null text or a negative entry count before any work is done. */
	private static void checkArguments(String str, int index) {
		if (str == null) {
			throw new NullPointerException("str must not be null");
		}
		if (index < 0) {
			throw new IllegalArgumentException("index must not be negative: " + index);
		}
	}

	/** Adds one occurrence of {@code token} to {@code counts}. */
	private static void increment(Map<String, Integer> counts, String token) {
		Integer current = counts.get(token);
		counts.put(token, current == null ? 1 : current + 1);
	}

	/**
	 * Ranks the counted tokens and formats the first {@code index} of them.
	 *
	 * <p>
	 * The count and token are taken straight from the map entry rather than being
	 * parsed back out of a concatenated string, so tokens containing digits or
	 * '+' are reported unchanged.
	 */
	private static String[] formatTop(Map<String, Integer> counts, int index) {
		List<Map.Entry<String, Integer>> ranked = new ArrayList<Map.Entry<String, Integer>>(counts.entrySet());
		Collections.sort(ranked, MOST_FREQUENT_FIRST);
		// Never ask for more entries than exist.
		int limit = Math.min(index, ranked.size());
		String[] result = new String[limit];
		for (int position = 0; position < limit; position++) {
			Map.Entry<String, Integer> entry = ranked.get(position);
			result[position] = (position + 1) + "：" + entry.getValue() + "  " + entry.getKey();
		}
		return result;
	}

	/**
	 * Demo: prints the three most frequent words of an English sample, a
	 * separator line, then the three most frequent Han characters of a Chinese
	 * sample.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		String s = "Although it was autumn, the snow was already beginning to fall in Tibet. Our legs were so heavy and cold that they felt like blocks of ice. Have you ever seen snowmen ride bicycles? That's what we looked like! Along the way children dressed in long wool coats stopped to look at us. In the late afternoon we found it was so cold that our water bottles froze. However, the lakes shone like glass in the setting sun and looked wonderful. Wang Wei rode in front of me as usual. She is very reliable and I knew I didn't need to encourage her. To climb the mountains was hard work but as we looked around us, we were surprised by the view. We seemed to be able to see for miles. At one point we were so high that we found ourselves cycling through clouds. Then we began going down the hills. It was great fun especially as it gradually became much warmer. In the valleys colorful butterflies flew around us and we saw many yaks and sheep eating green grass. At this point we had to change our caps, coats, gloves and trousers for T-shirts and shorts.";
		String[] str = ChineseSearch.countToEng(s, 3);
		for (String t : str) {
			System.out.println(t);
		}
		System.out.println("---------------------");
		String s1 = "页的记录；另一种是每分一页就做一次查询，每次只查出当页需要的记录。第一种做法不是很可取，如果一张表中有上百万条记录的话，这无疑将会很慢，而实际项目中海量的数据是 无处不在的！所以比较好的方式是采用后种方法，所以这里就涉及到如何从数据库中得到指 定的记录？相比于mysql,sqlserver等数据库，oracle的这种查询语句相对复杂一点。 方法一（使用子查询）：rownum关键字在语法规定上不能使用大于一个数值的形式，所以 我们就要利用子查询来巧妙地实现，具体语句如下： ";
		String[] str1 = ChineseSearch.countToZH(s1, 3);
		for (String t : str1) {
			System.out.println(t);
		}
	}
}
